### Parameter


In [2]:
grid1 = {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0.001, 'n_estimators': 75, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}
grid2 = {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 8, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}
grid3 = {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}


In [3]:
import sys
import os

# Add the parent directory to the system path
sys.path.append(os.path.abspath('../'))  # Adjust the path as needed

from my_util import df_to_corr_matrix, remove_outliers

import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from matplotlib.colors import Normalize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.impute import KNNImputer

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from joblib import Parallel, delayed

import xgboost as xgb
from xgboost import XGBClassifier

from pickle import dump , load

import warnings

2024-12-08 13:43:03.611456: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-08 13:43:03.698524: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733665383.730998   32201 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733665383.739369   32201 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-08 13:43:03.817875: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
test_file_path = '../TestDatasetExample.xls'

### Load data

In [4]:
X = pd.read_excel(test_file_path)

X.replace(999, np.nan, inplace=True)

NUM_OF_SELECTED_FEATURES = "corr_25"

with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:
    selected_features = load(file)
    print(f"Loaded '{file.name}' to selected_feature")

X = X[selected_features]
print('Loaded selected_features to X')

Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature
Loaded selected_features to X


### Load model

In [5]:
model = XGBClassifier()
model.load_model("model.ubj")

print(selected_features)
y_pred = model.predict(X)

['Gene', 'HER2', 'PgR', 'ER', 'original_firstorder_10Percentile', 'original_ngtdm_Busyness', 'LNStatus', 'TumourStage', 'original_gldm_DependenceEntropy', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_ngtdm_Strength', 'original_gldm_SmallDependenceEmphasis', 'original_firstorder_InterquartileRange', 'original_shape_MajorAxisLength', 'original_glrlm_LongRunLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'HistologyType', 'ChemoGrade', 'original_shape_Maximum2DDiameterRow', 'original_shape_Maximum2DDiameterColumn', 'original_shape_SurfaceVolumeRatio', 'original_shape_LeastAxisLength', 'original_glcm_Autocorrelation', 'original_shape_Sphericity']


In [6]:
y_pred

array([0, 1, 1])

### Retrain the model with different data and evaluate the model

In [9]:
model = XGBClassifier()

NUM_OF_SELECTED_FEATURES = "corr_25"

data = pd.read_excel("../TrainDataset2024.xls")
data.replace(999, np.nan, inplace=True)

data.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
data.dropna(subset=["pCR (outcome)"], inplace=True)

with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:
    selected_features = load(file)
    print(f"Loaded '{file.name}' to selected_feature")

X = data[selected_features]
y = data["pCR (outcome)"]
print(X.shape, y.shape)

rs = 10
while True:  
    # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=46) # similar distribution of 1 and 0
    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=rs)

    X_train_full.reset_index(drop=True, inplace=True)
    X_test_reserved.reset_index(drop=True, inplace=True)
    y_train_full.reset_index(drop=True, inplace=True)
    y_test_reserved.reset_index(drop=True, inplace=True)

    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)
    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)

    if abs(ratio_train - ratio_test) < 0.01:
        print(f"Split data using train_test_split with random_state={rs}")
        break
    rs+=1

print("Splited the data into train and test. The test will not be used in the training, but just for test the xgb. ")
print(f"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. ")
print(f"Positive ratio: \n\tTrain: {ratio_train:.5f}\n\tTest: {ratio_test:.5f}")

# stratified_kfold = StratifiedKFold(n_splits=5, shuffle=False)
rs = 13
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)

model.set_params(**grid)

print(f"\nCross validation for the train set using StratifiedKFold with random_state={rs}: {X_train_full.shape}")

y_pred_cv = cross_val_predict(model, X_train_full, y_train_full, cv=stratified_kfold)
print(confusion_matrix(y_train_full, y_pred_cv))
print(classification_report(y_train_full, y_pred_cv))
print(f"Balanced accuracy score: {balanced_accuracy_score(y_train_full, y_pred_cv)}")
print(f"F1 Score: {f1_score(y_train_full, y_pred_cv)}")
print(f"Precision: {precision_score(y_train_full, y_pred_cv)}")
print(f"Recall: {recall_score(y_train_full, y_pred_cv)}")
print(f"Specificity: {recall_score(y_train_full, y_pred_cv, pos_label=0)}")
print()

model.fit(X_train_full, y_train_full)
y_pred = model.predict(X_test_reserved)

print(f"\nResult of the test set: {X_test_reserved.shape}")

print(confusion_matrix(y_test_reserved, y_pred))
print(classification_report(y_test_reserved, y_pred))
print(f"Balanced accuracy score: {balanced_accuracy_score(y_test_reserved, y_pred)}")
print(f"F1 Score: {f1_score(y_test_reserved, y_pred)}")
print(f"Precision: {precision_score(y_test_reserved, y_pred)}")
print(f"Recall: {recall_score(y_test_reserved, y_pred)}")
print(f"Specificity: {recall_score(y_test_reserved, y_pred, pos_label=0)}")


print("\nUse the whole data to train and do CV using StratifiedKFold with random_state={rs}")
y_pred_cv = cross_val_predict(model, X, y, cv=stratified_kfold)
print(confusion_matrix(y, y_pred_cv))
print(classification_report(y, y_pred_cv))
print(f"Balanced accuracy score: {balanced_accuracy_score(y, y_pred_cv)}")
print(f"F1 Score: {f1_score(y, y_pred_cv)}")
print(f"Precision: {precision_score(y, y_pred_cv)}")
print(f"Recall: {recall_score(y, y_pred_cv)}")
print(f"Specificity: {recall_score(y, y_pred_cv, pos_label=0)}")
print()


print(f"Predict the test file:")

X = pd.read_excel(test_file_path)

X.replace(999, np.nan, inplace=True)

NUM_OF_SELECTED_FEATURES = "corr_25"

with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:
    selected_features = load(file)
    print(f"Loaded '{file.name}' to selected_feature")

X = X[selected_features]
y_pred = model.predict(X)

print(y_pred)

Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature
(395, 25) (395,)
Split data using train_test_split with random_state=14
Splited the data into train and test. The test will not be used in the training, but just for test the xgb. 
The training data has 316 data. The testing data has 79 data. 
Positive ratio: 
	Train: 0.21203
	Test: 0.21519


NameError: name 'grid' is not defined

In [None]:

NUM_OF_SELECTED_FEATURES = "corr_25"

with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:
    selected_features = load(file)
    print(f"Loaded '{file.name}' to selected_feature")


files = [("../train_data.xls", "../test_data.xls"), ("../train_data_2.xls", "../test_data_2.xls"), ("../train_data_3.xls", "../test_data_3.xls")]

ba = []

for index, (train_file, test_file) in enumerate(files):    
    data = pd.read_excel(train_file)
    data.replace(999, np.nan, inplace=True)

    data.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
    data.dropna(subset=["pCR (outcome)"], inplace=True)

    X = data.drop(columns='pCR (outcome)', axis=1)
    X = X[selected_features]
    y = data["pCR (outcome)"]
    # print(X.shape, y.shape)

    testdata = pd.read_excel(test_file)
    testdata.replace(999, np.nan, inplace=True)

    testdata.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
    testdata.dropna(subset=["pCR (outcome)"], inplace=True)

    X_test = testdata.drop(columns='pCR (outcome)', axis=1)
    X_test = X_test[selected_features]
    y_test = testdata["pCR (outcome)"]
    # print(X_test.shape, y_test.shape)

    model1 = XGBClassifier()
    model1.set_params(**grid1)
    model2 = XGBClassifier()
    model2.set_params(**grid2)
    model3 = XGBClassifier()
    model3.set_params(**grid3)

    model1.fit(X, y)
    model2.fit(X, y)
    model3.fit(X, y)

    y_pred = []
    y_pred.append(model1.predict(X_test))
    y_pred.append(model2.predict(X_test))
    y_pred.append(model3.predict(X_test))
    y_pred = np.array(y_pred)

    yp = np.round(np.average(y_pred, axis=0))
    print(f"File {index}")
    print(confusion_matrix(y_test, yp))
    ba.append(balanced_accuracy_score(y_test, yp))
    print(ba[-1])
print(f"Averaged balanced accuracy: {np.mean(ba)}")


Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature
File 0
[[44 18]
 [ 6 11]]
0.6783681214421253
File 1
[[38 24]
 [ 1 16]]
0.7770398481973435
File 2
[[42 20]
 [ 5 12]]
0.6916508538899431
Averaged balanced accuracy: 0.715686274509804
